tutorials/008 - Redshift - Copy & Unload.ipynb (575 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[](https://github.com/aws/aws-sdk-pandas)\n",
"\n",
"# 8 - Redshift - COPY & UNLOAD\n",
"\n",
"`Amazon Redshift` has two SQL command that help to load and unload large amount of data staging it on `Amazon S3`:\n",
"\n",
"1 - [COPY](https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html)\n",
"\n",
"2 - [UNLOAD](https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html)\n",
"\n",
"Let's take a look and how awswrangler can use it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install the optional modules first\n",
"!pip install 'awswrangler[redshift]'"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import awswrangler as wr\n",
"\n",
"con = wr.redshift.connect(\"aws-sdk-pandas-redshift\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Enter your bucket name:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ···········································\n"
]
}
],
"source": [
"import getpass\n",
"\n",
"bucket = getpass.getpass()\n",
"path = f\"s3://{bucket}/stage/\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Enter your IAM ROLE ARN:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ····················································································\n"
]
}
],
"source": [
"iam_role = getpass.getpass()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Creating a DataFrame from the NOAA's CSV files\n",
"\n",
"[Reference](https://registry.opendata.aws/noaa-ghcn/)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>dt</th>\n",
" <th>element</th>\n",
" <th>value</th>\n",
" <th>m_flag</th>\n",
" <th>q_flag</th>\n",
" <th>s_flag</th>\n",
" <th>obs_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AG000060590</td>\n",
" <td>1897-01-01</td>\n",
" <td>TMAX</td>\n",
" <td>170</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>E</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AG000060590</td>\n",
" <td>1897-01-01</td>\n",
" <td>TMIN</td>\n",
" <td>-14</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>E</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AG000060590</td>\n",
" <td>1897-01-01</td>\n",
" <td>PRCP</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>E</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AGE00135039</td>\n",
" <td>1897-01-01</td>\n",
" <td>TMAX</td>\n",
" <td>140</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>E</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AGE00135039</td>\n",
" <td>1897-01-01</td>\n",
" <td>TMIN</td>\n",
" <td>40</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>E</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3923594</th>\n",
" <td>UZM00038457</td>\n",
" <td>1897-12-31</td>\n",
" <td>TMIN</td>\n",
" <td>-145</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>r</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3923595</th>\n",
" <td>UZM00038457</td>\n",
" <td>1897-12-31</td>\n",
" <td>PRCP</td>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>r</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3923596</th>\n",
" <td>UZM00038457</td>\n",
" <td>1897-12-31</td>\n",
" <td>TAVG</td>\n",
" <td>-95</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>r</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3923597</th>\n",
" <td>UZM00038618</td>\n",
" <td>1897-12-31</td>\n",
" <td>PRCP</td>\n",
" <td>66</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>r</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3923598</th>\n",
" <td>UZM00038618</td>\n",
" <td>1897-12-31</td>\n",
" <td>TAVG</td>\n",
" <td>-45</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>r</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3923599 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" id dt element value m_flag q_flag s_flag obs_time\n",
"0 AG000060590 1897-01-01 TMAX 170 NaN NaN E NaN\n",
"1 AG000060590 1897-01-01 TMIN -14 NaN NaN E NaN\n",
"2 AG000060590 1897-01-01 PRCP 0 NaN NaN E NaN\n",
"3 AGE00135039 1897-01-01 TMAX 140 NaN NaN E NaN\n",
"4 AGE00135039 1897-01-01 TMIN 40 NaN NaN E NaN\n",
"... ... ... ... ... ... ... ... ...\n",
"3923594 UZM00038457 1897-12-31 TMIN -145 NaN NaN r NaN\n",
"3923595 UZM00038457 1897-12-31 PRCP 4 NaN NaN r NaN\n",
"3923596 UZM00038457 1897-12-31 TAVG -95 NaN NaN r NaN\n",
"3923597 UZM00038618 1897-12-31 PRCP 66 NaN NaN r NaN\n",
"3923598 UZM00038618 1897-12-31 TAVG -45 NaN NaN r NaN\n",
"\n",
"[3923599 rows x 8 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cols = [\"id\", \"dt\", \"element\", \"value\", \"m_flag\", \"q_flag\", \"s_flag\", \"obs_time\"]\n",
"\n",
"df = wr.s3.read_csv(\n",
" path=\"s3://noaa-ghcn-pds/csv/by_year/1897.csv\", names=cols, parse_dates=[\"dt\", \"obs_time\"]\n",
") # ~127MB, ~4MM rows\n",
"\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and Unload with COPY and UNLOAD commands\n",
"\n",
"> Note: Please use a empty S3 path for the COPY command."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.78 s, sys: 293 ms, total: 3.08 s\n",
"Wall time: 20.7 s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"wr.redshift.copy(\n",
" df=df,\n",
" path=path,\n",
" con=con,\n",
" schema=\"public\",\n",
" table=\"commands\",\n",
" mode=\"overwrite\",\n",
" iam_role=iam_role,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 10 s, sys: 1.14 s, total: 11.2 s\n",
"Wall time: 27.5 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>dt</th>\n",
" <th>element</th>\n",
" <th>value</th>\n",
" <th>m_flag</th>\n",
" <th>q_flag</th>\n",
" <th>s_flag</th>\n",
" <th>obs_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AG000060590</td>\n",
" <td>1897-01-01</td>\n",
" <td>TMAX</td>\n",
" <td>170</td>\n",
" <td><NA></td>\n",
" <td><NA></td>\n",
" <td>E</td>\n",
" <td><NA></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AG000060590</td>\n",
" <td>1897-01-01</td>\n",
" <td>PRCP</td>\n",
" <td>0</td>\n",
" <td><NA></td>\n",
" <td><NA></td>\n",
" <td>E</td>\n",
" <td><NA></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AGE00135039</td>\n",
" <td>1897-01-01</td>\n",
" <td>TMIN</td>\n",
" <td>40</td>\n",
" <td><NA></td>\n",
" <td><NA></td>\n",
" <td>E</td>\n",
" <td><NA></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AGE00147705</td>\n",
" <td>1897-01-01</td>\n",
" <td>TMAX</td>\n",
" <td>164</td>\n",
" <td><NA></td>\n",
" <td><NA></td>\n",
" <td>E</td>\n",
" <td><NA></td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AGE00147705</td>\n",
" <td>1897-01-01</td>\n",
" <td>PRCP</td>\n",
" <td>0</td>\n",
" <td><NA></td>\n",
" <td><NA></td>\n",
" <td>E</td>\n",
" <td><NA></td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3923594</th>\n",
" <td>USW00094967</td>\n",
" <td>1897-12-31</td>\n",
" <td>TMAX</td>\n",
" <td>-144</td>\n",
" <td><NA></td>\n",
" <td><NA></td>\n",
" <td>6</td>\n",
" <td><NA></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3923595</th>\n",
" <td>USW00094967</td>\n",
" <td>1897-12-31</td>\n",
" <td>PRCP</td>\n",
" <td>0</td>\n",
" <td>P</td>\n",
" <td><NA></td>\n",
" <td>6</td>\n",
" <td><NA></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3923596</th>\n",
" <td>UZM00038457</td>\n",
" <td>1897-12-31</td>\n",
" <td>TMAX</td>\n",
" <td>-49</td>\n",
" <td><NA></td>\n",
" <td><NA></td>\n",
" <td>r</td>\n",
" <td><NA></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3923597</th>\n",
" <td>UZM00038457</td>\n",
" <td>1897-12-31</td>\n",
" <td>PRCP</td>\n",
" <td>4</td>\n",
" <td><NA></td>\n",
" <td><NA></td>\n",
" <td>r</td>\n",
" <td><NA></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3923598</th>\n",
" <td>UZM00038618</td>\n",
" <td>1897-12-31</td>\n",
" <td>PRCP</td>\n",
" <td>66</td>\n",
" <td><NA></td>\n",
" <td><NA></td>\n",
" <td>r</td>\n",
" <td><NA></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>7847198 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" id dt element value m_flag q_flag s_flag obs_time\n",
"0 AG000060590 1897-01-01 TMAX 170 <NA> <NA> E <NA>\n",
"1 AG000060590 1897-01-01 PRCP 0 <NA> <NA> E <NA>\n",
"2 AGE00135039 1897-01-01 TMIN 40 <NA> <NA> E <NA>\n",
"3 AGE00147705 1897-01-01 TMAX 164 <NA> <NA> E <NA>\n",
"4 AGE00147705 1897-01-01 PRCP 0 <NA> <NA> E <NA>\n",
"... ... ... ... ... ... ... ... ...\n",
"3923594 USW00094967 1897-12-31 TMAX -144 <NA> <NA> 6 <NA>\n",
"3923595 USW00094967 1897-12-31 PRCP 0 P <NA> 6 <NA>\n",
"3923596 UZM00038457 1897-12-31 TMAX -49 <NA> <NA> r <NA>\n",
"3923597 UZM00038457 1897-12-31 PRCP 4 <NA> <NA> r <NA>\n",
"3923598 UZM00038618 1897-12-31 PRCP 66 <NA> <NA> r <NA>\n",
"\n",
"[7847198 rows x 8 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"\n",
"wr.redshift.unload(\n",
" sql=\"SELECT * FROM public.commands\",\n",
" con=con,\n",
" iam_role=iam_role,\n",
" path=path,\n",
" keep_files=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"con.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "awswrangler-v9JnknIF-py3.8",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5 (default, Apr 13 2022, 19:37:23) \n[Clang 13.0.0 (clang-1300.0.27.3)]"
},
"vscode": {
"interpreter": {
"hash": "83297b058d59ee0acd247586c837429190a8258f15c0eea6234359f5557dde51"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}